/*****************************************************************************************************************************************************/
/*  Project: Associations between risk factors and AD                                                                                                */
/*  Programmer: Longgang Zhao (lz7@email.sc.edu)                                                                *    ****                            */
/*  Date: May 16, 2002, version 1                                                                               *      *                             */
/*  Supervisor: Anwar T Merchant                                                                                *     *                              */
/*  Dataset: NHANES III (https://wwwn.cdc.gov/nchs/nhanes/nhanes3/default.aspx)                                 **** ****                            */
/*  Purpose 1: Define risk factor and find clusters uisng ML                                                                                         */
/*****************************************************************************************************************************************************/

/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part I: macro and formats***************************************************/
/*------------------------------------------------------------------------------------------------------------*/

%macro getdetails(dataset,number=20);
options ls=max; proc contents data = &dataset varnum; run; proc print data = &dataset (obs = &number); run;
%mend;

%macro getlevels(dataset, var);
title "&dataset and &var"; proc sql; select count(distinct &var) as Levels, count(*) as Nobs from &dataset; quit; title;
%mend;

%macro histogram(dataset, var, format=5.2);
ods listing close;
title "&var";
ods select histogram;
proc univariate data = &dataset noprint; 
var &var;
histogram &var/vscale=count;
inset n='Sample size'(5.0) nmiss='Missing'(5.0) mean='Mean'(&format) std='Std Dev'(&format) median='Median'(&format) min='Min'(&format) max='Max'(&format);
format &var &format;
run;
title;
ods listing;
%mend;
%macro con(datain=,conlist=);
proc means min median max mean std n nmiss data = &datain; var &conlist; run;
%mend;
%macro cat(datain=,catlist=);
proc freq data = &datain; tables &catlist/norow nocol nopercent missing; run;
%mend;
%macro getmain(datain=estimate);
data estimate2; set &datain;
 label=put(substr(Effect,1,20),$20.);
 OR=put(round(OddsRatioEst,0.01),4.2)||' ('||put(round(LowerCL,0.01),4.2)||'-'||put(round(UpperCL,0.01),4.2)||')';
 keep label OR;  
run;
proc print; run;
%mend;
%macro getpvalue(datain=pvalue);
data pvalue2; set &datain;
label=put(substr(Variable,1,10),$10.);
if _n_ = 1 then delete;
if ProbChiSq < 0.001 then pvalue = "< 0.001";
 else if ProbChiSq < 0.01 then pvalue = put(round(ProbChiSq,0.001), 8.4);
  else                          pvalue = put(round(ProbChiSq,0.01), 8.3);
  keep label pvalue;
run; proc print data=pvalue2; run;
%mend;
/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part IV: Univariate Analyses************************************************/
/*------------------------------------------------------------------------------------------------------------*/
libname dataloc "C:\Users\lz7\OneDrive - University of South Carolina\AD risk factors\Summer GA 2022\Data";
options fmtsearch=(dataloc);
data analyses; set dataloc.nhanes11_14;run;

%con(datain=analyses, conlist=Age bmi WC HEI SBP DBP WBC /*GBP CRP*/ TCP HDP /*LCP TGP LPP scl_phone scl_together scl_neighb scl_church scl_meeting*/ cognition);
%cat(datain=analyses, catlist=Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol /*anxiety*/ depression yvisit perios ind_missing);

proc means data=analyses n nmiss min p10 p25 p50 p75 p90 max range; var cognition; run;

proc univariate data=analyses; var cognition; histogram cognition; run;


data onlycogn; set analyses;run;

proc freq data=onlycogn; tables cogn_cat; run;

%con(datain=onlycogn, conlist=Age bmi WC HEI SBP DBP WBC /*GBP CRP*/ TCP HDP /*LCP TGP LPP scl_phone scl_together scl_neighb scl_church scl_meeting*/ cognition);

/*===========================================================*/
/*Table 1*/
%include "C:\Users\lz7\OneDrive - University of South Carolina\Tools\SASMacro\Macro_describe_perct.sas";
%describe(inputdata=onlycogn,exposure=cogn_cat,method=1,
            varlist=Age bmi WC HEI SBP DBP WBC TCP HDP,
            catlist=Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol depression yvisit perios,
              digit=0.1);

/*categorical variables*/
%macro getOR(varlist=);
%if %sysfunc(exist(baseOR)) %then %do; proc delete data=baseOR; run; %end;
%if %sysfunc(exist(basep))  %then %do; proc delete data=basep;  run; %end;
%let varnumber=1;
%do %while (%scan(&varlist.,&varnumber.) ne );
%let var=%scan(&varlist.,&varnumber.);
proc logistic data = onlycogn;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = &var;
 ods output OddsRatios=estimate ParameterEstimates=pvalue;
run;
data estimate2; set estimate;
 label=put(substr(Effect,1,20),$20.);
 OR=put(round(OddsRatioEst,0.01),4.2)||' ('||put(round(LowerCL,0.01),4.2)||'-'||put(round(UpperCL,0.01),4.2)||')';
 keep label OR;  
run;
data pvalue2; set pvalue;
label=put(substr(Variable,1,10),$10.);
if _n_ = 1 then delete;
if ProbChiSq < 0.001 then pvalue = "< 0.001";
 else if ProbChiSq < 0.01 then pvalue = put(round(ProbChiSq,0.001), 8.4);
  else                          pvalue = put(round(ProbChiSq,0.01), 8.3);
  keep label pvalue;
run;
%let varnumber=%eval(&varnumber.+1);
proc append base=baseOR data=estimate2 force;run;
proc append base=basep  data=pvalue2   force;run;
%end;
%mend;

%getOR(varlist=Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol depression yvisit perios); 
proc print data=baseOR; run; proc print data=basep; run;

/*continuous variables*/
%macro getOR(unit=,varlist=);
%if %sysfunc(exist(baseOR)) %then %do; proc delete data=baseOR; run; %end;
%if %sysfunc(exist(basep))  %then %do; proc delete data=basep;  run; %end;
%let varnumber=1;
%do %while (%scan(&varlist.,&varnumber.) ne );
%let var=%scan(&varlist.,&varnumber.);
proc logistic data = onlycogn;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = &var/clodds=wald;
 units &var=&unit;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
data estimate2; set estimate;
 label=put(substr(Effect,1,20),$20.);
 OR=put(round(OddsRatioEst,0.01),4.2)||' ('||put(round(LowerCL,0.01),4.2)||'-'||put(round(UpperCL,0.01),4.2)||')';
 keep label OR Unit;  
run;
data pvalue2; set pvalue;
label=put(substr(Variable,1,10),$10.);
if _n_ = 1 then delete;
if ProbChiSq < 0.001 then pvalue = "< 0.001";
 else if ProbChiSq < 0.01 then pvalue = put(round(ProbChiSq,0.001), 8.4);
  else                          pvalue = put(round(ProbChiSq,0.01), 8.3);
  keep label pvalue;
run;
%let varnumber=%eval(&varnumber.+1);
proc append base=baseOR data=estimate2 force;run;
proc append base=basep  data=pvalue2   force;run;
%end;
%mend;
%getOR(unit=5,varlist=Age bmi WC HEI); proc print data=baseOR; run; proc print data=basep; run;

%getOR(unit=10,varlist=SBP DBP); proc print data=baseOR; run; proc print data=basep; run;

%getOR(unit=SD,varlist=WBC TCP HDP); proc print data=baseOR; run; proc print data=basep; run;



/*MV model*/
proc logistic data = onlycogn;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = Age bmi WC HEI SBP DBP WBC TCP HDP
                             racegp sex edugp incomegp smkgp drinkgp rpagp hsDM hsCVD hsHP hsCancer hsChol depression yvisit perios/clodds=wald;
 units Age=5 bmi=5 WC=5 HEI=5 SBP=10 DBP=10 WBC=SD TCP=SD HDP=SD;
ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
data estimate2; set estimate;
 label=put(substr(Effect,1,20),$20.);
 OR=put(round(OddsRatioEst,0.01),4.2)||' ('||put(round(LowerCL,0.01),4.2)||'-'||put(round(UpperCL,0.01),4.2)||')';
 keep label OR units;  
run;
data pvalue2; set pvalue;
label=put(substr(Variable,1,10),$10.);
if _n_ = 1 then delete;
if ProbChiSq < 0.001 then pvalue = "< 0.001";
 else if ProbChiSq < 0.01 then pvalue = put(round(ProbChiSq,0.001), 8.4);
  else                          pvalue = put(round(ProbChiSq,0.01), 8.3);
  keep label pvalue;
run;
proc print data=estimate2; run; proc print data=pvalue2; run;

/*Scoring details based on model with AIC=2112.6
Age ---------------------------   0.057
Waist circumference -----------  -0.017 
HEI ---------------------------  -0.012
Social connect with phone -----  -0.020
Social connect with church ----  -0.002
Race with others --------------   0.660
Race with black ---------------   0.416
Education ---------------------  -0.710
Income high -------------------  -0.875
Income middle -----------------  -0.325
Physical activity moderately --  -0.172
Physical activity vigorously --   0.197
Diabetes ----------------------   0.324
Hypercholesterolemia ----------  -0.422
Annual visit of dentists ------  -0.542
************************************************/

data nomissing; set onlycogn;
s_age        =  0.0057*Age;
s_WC         = -0.017*WC;
s_HEI        = -0.012*HEI;

s_race = 0;
if racegp = 3 then s_race = 0.660;
 else if racegp = 2 then s_race = 0.416;

s_edu = 0;
if edugp = 2 then s_edu = -0.710;

s_income = 0;
if incomegp = 3 then s_income = -0.875;
 else if incomegp = 2 then s_income = -0.325;

s_pa = 0;
if rpagp = 2 then s_pa = -0.172;
 else if rpagp = 1 then s_pa = 0.197;

 s_dm = 0;
 if hsDM = 1 then s_dm = 0.324;

 s_hc = 0;
 if hsChol = 1 then s_hc = -0.422;

 s_yvisit = 0;
 if yvisit = 1 then s_yvisit = -0.542;

 riskscore = sum (s_age,s_WC,s_race, s_edu, s_income, s_pa, s_dm, s_hc, s_yvisit);
 run;

proc univariate data=nomissing;
  var riskscore;
  histogram;
run;

proc rank data=nomissing out=nomissing group=3;
  var riskscore; ranks riskscoret;
run;

proc means data=nomissing min max median; var riskscore; class riskscoret; run;

proc freq data=nomissing;
  tables riskscoret*cogn_cat/nocol nopercent;
run;

proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0')/param=ref;
 model cogn_cat (ref="NO") = riskscoret/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;
proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0') racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscoret Age racegp sex/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;



/*ROC curve*/
proc logistic data = nomissing;
 class cogn_cat(ref="NO") /param=ref;
 model cogn_cat (ref="NO") = riskscore/clodds=wald;
 output out=LogiOut predicted=crudem;
run;
proc logistic data = LogiOut;
 class cogn_cat(ref="NO") racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex/clodds=wald;
 output out=LogiOut predicted=adjustedm;
run;
proc logistic data = LogiOut;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex edugp hsCVD WBC depression/clodds=wald;
 output out=LogiOut predicted=covariatem;
run;

proc logistic data = LogiOut;
model cogn_cat (ref="NO") = crudem adjustedm covariatem/nofit;
roc "Crude model"  pred = crudem;
roc "Adjusted model" pred = adjustedm;
roc "Covariates model" pred= covariatem;
ods select ROCOverlay;
run;


/*crude score*/

data nomissing; set onlycogn;
s_age        =  6*Age/10;
s_WC         = 20*WC/10;
s_HEI        = 10*HEI/10;

s_race = 0;
if racegp = 3 then s_race = 70;
 else if racegp = 2 then s_race = 40;

s_edu = 0;
if edugp = 2 then s_edu = -70;

s_income = 0;
if incomegp = 3 then s_income = -90;
 else if incomegp = 2 then s_income = -30;

s_pa = 0;
if rpagp = 2 then s_pa = -20;
 else if rpagp = 1 then s_pa = 20;

 s_dm = 0;
 if hsDM = 1 then s_dm = 30;

 s_hc = 0;
 if hsChol = 1 then s_hc = -40;

 s_yvisit = 0;
 if yvisit = 1 then s_yvisit = -50;

 riskscore = sum (s_age,s_WC,s_race, s_edu, s_income, s_pa, s_dm, s_hc, s_yvisit);
 run;

proc univariate data=nomissing;
  var riskscore;
  histogram;
run;

proc rank data=nomissing out=nomissing group=3;
  var riskscore; ranks riskscoret;
run;

proc means data=nomissing min max median; var riskscore; class riskscoret; run;

proc freq data=nomissing;
  tables riskscoret*cogn_cat/nocol nopercent;
run;

proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0')/param=ref;
 model cogn_cat (ref="NO") = riskscoret/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;
proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0') racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscoret Age racegp sex/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;

/*ROC curve*/
proc logistic data = nomissing;
 class cogn_cat(ref="NO") /param=ref;
 model cogn_cat (ref="NO") = riskscore/clodds=wald;
 output out=LogiOut predicted=crudem;
run;
proc logistic data = LogiOut;
 class cogn_cat(ref="NO") racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex/clodds=wald;
 output out=LogiOut predicted=adjustedm;
run;
proc logistic data = LogiOut;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex edugp hsCVD WBC depression/clodds=wald;
 output out=LogiOut predicted=covariatem;
run;

proc logistic data = LogiOut;
model cogn_cat (ref="NO") = crudem adjustedm covariatem/nofit;
roc "Crude model"  pred = crudem;
roc "Adjusted model" pred = adjustedm;
roc "Covariates model" pred= covariatem;
ods select ROCOverlay;
run;

/*get cut points*/
proc logistic data = nomissing;
 class cogn_cat(ref="NO") racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex/clodds=wald outroc=rocstats;
run;

data check; set rocstats;
_SPECIF_ = (1 - _1MSPEC_);
J = _SENSIT_ + _SPECIF_ - 1;
D= Sqrt((1-_SENSIT_)**2 + (1-_SPECIF_)**2);
run;

proc sql noprint; create table cutoff1 as select _PROB_ , J from check having J = max(J); quit; proc print data=cutoff1; run;
proc sql noprint; create table cutoff2 as select _PROB_ , D from check having D = min(D); quit; proc print data=cutoff2; run;

proc print data=check; where J=0.39607; run;


/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part VI: SVM Risk factor score   *******************************************/
/*------------------------------------------------------------------------------------------------------------*/
/*
proc logistic data = nomissing;
 class Agegp(ref="40-<65") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="Sedentary") hsChol(ref="NO")
       diabetes(ref="NO") hsCVD(ref="NO") HP(ref="NO") Cancer(ref="NO") HC(ref="NO") anxiety(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = Age WC HEI DBP WBC HDP racegp edugp incomegp rpagp depression yvisit/
                             clodds=wald;
run;
/*
/*Scoring details based on model: values from the NHANES III
Age ---------------------------   0.057
Waist circumference -----------  -0.016 
HEI ---------------------------  -0.015
DBP ---------------------------  -0.006
WBC ---------------------------  -0.026
HDP ---------------------------  -0.0004
Race with others --------------   0.734
Race with black ---------------   0.367
Education ---------------------  -0.756
Income high -------------------  -0.861
Income middle -----------------  -0.301
Physical activity moderately --  -0.157
Physical activity vigorously --   0.212
Depression --------------------   0.342
Annual visit of dentists ------  -0.601
************************************************/

data nomissing; set onlycogn;
s_age        =  0.057*Age;
s_WC         = -0.016*WC;
s_HEI        = -0.015*HEI;
s_DBP        = -0.006*DBP;
s_WBC        = -0.026*WBC;
s_HDP        =-0.0004*HDP;

s_race = 0;
if racegp = 3 then s_race = 0.734;
 else if racegp = 2 then s_race = 0.367;

s_edu = 0;
if edugp = 2 then s_edu = -0.756;

s_income = 0;
if incomegp = 3 then s_income = -0.861;
 else if incomegp = 2 then s_income = -0.301;

s_pa = 0;
if rpagp = 2 then s_pa = -0.157;
 else if rpagp = 1 then s_pa = 0.212;

 s_de = 0;
 if depression = 1 then s_de = 0.342;

 s_yvisit = 0;
 if yvisit = 1 then s_yvisit = -0.601;

 riskscore = sum (s_age, s_WC, s_HEI, s_DBP, s_WBC, s_HDP, s_race, s_edu, s_income, s_pa, s_de, s_yvisit);
 run;

proc univariate data=nomissing;
  var riskscore;
  histogram;
run;

proc rank data=nomissing out=nomissing group=3;
  var riskscore; ranks riskscoret;
run;

proc freq data=nomissing;
  tables riskscoret*cogn_cat/nocol nopercent;
run;

proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0')/param=ref;
 model cogn_cat (ref="NO") = riskscoret/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;
proc logistic data = nomissing;
 class cogn_cat(ref="NO") riskscoret(ref='0') racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscoret Age racegp sex/clodds=wald;
 ods output  CLoddsWald=estimate ParameterEstimates=pvalue;
run;
%getmain; %getpvalue;

proc logistic data = nomissing;
 class cogn_cat(ref="NO") /param=ref;
 model cogn_cat (ref="NO") = riskscore/clodds=wald;
 output out=LogiOut predicted=crudem;
run;
proc logistic data = LogiOut;
 class cogn_cat(ref="NO") racegp(ref="white") sex(ref="Male") /param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex/clodds=wald;
 output out=LogiOut predicted=adjustedm;
run;
proc logistic data = LogiOut;
 class Agegp(ref="60-<70") racegp(ref="white") sex(ref="Male") edugp(ref="1_<12 yrs") incomegp(ref="LOW_PIR=<1.3") 
       smkgp(ref="NonSmoker") drinkgp(ref="NonDrinker") bmigp(ref="NORMAL") rpagp(ref="1") hsChol(ref="NO")
       hsDM(ref="NO") hsCVD(ref="NO") hsHP(ref="NO") hsCancer(ref="NO") hsChol(ref="NO") depression(ref="NO") 
       yvisit(ref="NO") perios(ref="No disease") cogn_cat(ref="NO")/param=ref;
 model cogn_cat (ref="NO") = riskscore Age racegp sex edugp hsCVD WBC depression/clodds=wald;
 output out=LogiOut predicted=covariatem;
run;

proc logistic data = LogiOut;
model cogn_cat (ref="NO") = crudem adjustedm covariatem/nofit;
roc "Crude model"  pred = crudem;
roc "Adjusted model" pred = adjustedm;
roc "Covariates model" pred= covariatem;
ods select ROCOverlay;
run;
